# conventional libraries
import os
import calendar
import pandas as pd
import numpy as np
# data visualizations
import plotly.graph_objs as go
from plotly.offline import init_notebook_mode,iplot
import seaborn as sns
import matplotlib.pyplot as plt
# algorithm
from scipy.sparse import coo_matrix
from implicit.als import AlternatingLeastSquares
from implicit.evaluation import ranking_metrics_at_k
from tqdm import tqdm
from sklearn.preprocessing import MinMaxScaler
from sklearn.cluster import KMeans
import warnings
warnings.filterwarnings("ignore")
df = pd.read_csv("df_cleaned.csv")
df_negPQ = pd.read_csv("df_negqual.csv")
def add_value_labels(ax, spacing=5):
"""Add labels to the end of each bar in a bar chart.
Arguments:
ax (matplotlib.axes.Axes): The matplotlib object containing the axes
of the plot to annotate.
spacing (int): The distance between the labels and the bars.
"""
# For each bar: Place a label
for rect in ax.patches:
# Get X and Y placement of label from rect.
y_value = rect.get_height()
x_value = rect.get_x() + rect.get_width() / 2
# Number of points between bar and label. Change to your liking.
space = spacing
# Vertical alignment for positive values
va = 'bottom'
# If value of bar is negative: Place label below bar
if y_value < 0:
# Invert space to place label below
space *= -1
# Vertically align label at top
va = 'top'
# Use Y value as label and format number with one decimal place
label = "{:.1f}".format(y_value)
# Create annotation
ax.annotate(
label, # Use `label` as label
(x_value, y_value), # Place label at end of the bar
xytext=(0, space), # Vertically shift label by `space`
textcoords="offset points", # Interpret `xytext` as offset in points
ha='center', # Horizontally center label
va=va) # Vertically align label differently for
# positive and negative values.
temp = df[['CustomerID', 'InvoiceNo', 'Description']].groupby(['CustomerID', 'InvoiceNo', 'Description']).count()
temp = temp.reset_index(drop = False)
countries = temp['Description'].value_counts()
print('Number of products in ds: {}'.format(len(countries)))
temp = df[['CustomerID', 'InvoiceNo', 'Country']].groupby(['CustomerID', 'InvoiceNo', 'Country']).count()
temp = temp.reset_index(drop = False)
countries = temp['Country'].value_counts()
print('Number of countries in ds: {}'.format(len(countries)))
print('Number of unique customers in ds: {}'.format(df.CustomerID.nunique()))
color = sns.color_palette("mako")
sns.palplot(color)
df.groupby(['Description']).size().reset_index(name='counts').sort_values(by=['counts'],ascending=False).head()
items = df['Description'].value_counts().head()
item_counts = df['Description'].value_counts().sort_values(ascending=False).head(10)
sns.set_style("darkgrid")
fig, ax = plt.subplots(figsize=(15,8))
ax = sns.barplot(y=item_counts.index, x=item_counts.values,palette = "mako")
plt.ylabel("Counts")
plt.title("Which items were bought more often?", fontsize=15,y=1.03)
plt.tight_layout()
plt.show()
#fig.savefig('eda1.png', transparent=True)
#Top 5 countries sales count wise in the cleaned up data.
df.Country.value_counts().head()
#Top 5 countries Total Gross Amount sales wise.
data_temp = df.groupby(['Country'])['TotalPrice'].agg('sum').reset_index().sort_values(by=['TotalPrice'],ascending=False).head()
print(data_temp)
sales_by_country = df.groupby(['Country']).TotalPrice.sum().sort_values(ascending = False).reset_index()
TotalSales = sales_by_country['TotalPrice'].sum()
sales_by_country['% of total sales'] = sales_by_country['TotalPrice']/TotalSales
#Top 5 countries by sales
sales_by_country.head()
import plotly.express as px
fig = px.pie(sales_by_country, values='% of total sales', names='Country',
title='Distribution of Sales per Country',
hover_data=['TotalPrice'], color_discrete_sequence=px.colors.sequential.GnBu_r)
fig.update_traces(textposition='inside', textinfo='percent+label')
fig.show()
#exploratory statistics
plt.figure(figsize=(15,6))
sns.set_style("darkgrid")
temp = df.groupby(['Country'])['TotalPrice'].apply(np.sum)
plt.plot(temp, color=color[2]);
plt.ylim(0,1.1*max(temp.values))
plt.ylabel('Total revenue over year',fontsize=12,fontweight='bold');
plt.xlabel('Country',fontsize=12,fontweight='bold');
plt.xticks(temp.index,temp.index,rotation=90)
plt.show()
#Most Popular Product in UK
indexed=df.pivot_table(index=['Country','StockCode','Description'], values='Quantity', aggfunc='sum').reset_index()
plt.figure(figsize=(10,5))
sns.barplot(y='Description', x='Quantity', data=indexed[indexed['Country']=='United Kingdom'].sort_values(by='Quantity', ascending=False).head(10),
palette=color)
plt.title('Top 10 Most Popular Items in UK', fontsize=14,y=1.03)
plt.ylabel('Item')
From above plots and calculations we can see that vast majority of sales were made in UK and just 8.49% went abroad. We can say our dataset is skewed to the UK side. For the purpose of the analysis, the UK are exlcuded from the next visualizations.
group_country_amount_spent = df.groupby('Country')['TotalPrice'].sum().sort_values()
del group_country_amount_spent['United Kingdom']
# plot total money spent by each country (without UK)
sns.set_style("darkgrid")
plt.subplots(figsize=(15,8))
group_country_amount_spent.plot(kind='barh', fontsize=12, color=color[1])
plt.xlabel('Money Spent (Dollar)', fontsize=12)
plt.ylabel('Country', fontsize=12)
plt.title('Money Spent by different Countries (UK excluded)', fontsize=14,y=1.03)
plt.show()
group_country_orders = df.groupby('Country')['InvoiceNo'].count().sort_values()
del group_country_orders['United Kingdom']
# plot number of unique customers in each country (without UK)
plt.subplots(figsize=(15,8))
group_country_orders.plot(kind='barh', fontsize=12, color=color[2])
plt.xlabel('Number of Orders', fontsize=12)
plt.ylabel('Country', fontsize=12)
plt.title('Number of Orders for different Countries (UK Excluded)', fontsize=14,y=1.03)
plt.show()
Even though the Netherlands are only the fourth country in Amount of Orders with a little more than 2100 orders, they have the higher amount of sales
reg = df[df['TotalPrice']>=0].groupby('Country').agg({'TotalPrice':'sum',
'Quantity': 'sum',
'CustomerID': 'count'})
fig, ax = plt.subplots(nrows=3, ncols=1, figsize=(10,30))
g1 = sns.barplot(x=reg['TotalPrice'], y=reg.index, alpha=1, ax=ax[0],palette='summer_r', orient='h')
g2 = sns.barplot(x=reg['Quantity'], y=reg.index, alpha=1, ax=ax[1], palette='ocean',orient='h')
g3 = sns.barplot(x=reg['CustomerID'], y=reg.index, alpha=1, ax=ax[2], palette='mako_r', orient='h')
ax[2].title.set_text('Customers Count by Country')
ax[2].set_xlabel("Customers (Log Scale)")
ax[1].title.set_text('Quantity Sold by Country')
ax[1].set_xlabel("Quantity (Log Scale)")
ax[0].title.set_text('Revenue by Country')
ax[0].set_xlabel("Revenue (Log Scale)")
g1.set_xscale("log")
g2.set_xscale("log")
g3.set_xscale("log")
plt.show()
Since the given website is a United Kindom originated website. All the variables such as No. of Customers and the Gross total sales is dominated by the United Kingdom. The remaning portion is occupied by the neighbouring European Countries.
df.groupby('InvoiceNo')['Hour'].unique().value_counts().iloc[:-1].sort_index()
ax = df.groupby('InvoiceNo')['Hour'].unique().value_counts().iloc[:-1].sort_index().plot(kind='bar',
color=color[1],figsize=(15,6))
ax.set_xlabel('Hour',fontsize=12)
ax.set_ylabel('Number of Orders',fontsize=12)
ax.set_title('Number of orders for different Hours',fontsize=15, y=1.03)
ax.set_xticklabels(range(6,21), rotation='horizontal', fontsize=13)
plt.show()
hour_sales = df.groupby(['Hour'])['TotalPrice'].agg(['sum','mean'])
sns.set_style("darkgrid")
fig, axes = plt.subplots(1, 2, figsize=(18, 5))
axes = axes.flatten()
sns.barplot(x=hour_sales.index, y=hour_sales['sum'], ax=axes[0],
palette="mako").set_title("Total Revenue in a day")
plt.ylabel('a')
sns.barplot(x=hour_sales.index, y=hour_sales['mean'], ax=axes[1],
palette="mako").set_title("Average Revenue per Invoice in a day")
plt.show()
#hour_sales
'''Convert InvoicdeDate to datetime'''
df['InvoiceDate'] = pd.to_datetime(df['InvoiceDate'])
fig, ax = plt.subplots(nrows=2, ncols=1,figsize=(15,10))
sns.set_style("darkgrid")
day = df[df['TotalPrice']>=0][['InvoiceDate','TotalPrice','Quantity']]
day = day.groupby(day['InvoiceDate'].dt.hour)[['TotalPrice','Quantity']].sum()
sns.barplot(data = day, x=day.index, y='TotalPrice', alpha=1, ax=ax[0], palette="mako_r")
sns.lineplot(data = day, x=day.index, y='Quantity', marker='o', sort = False, ax=ax[1])
fig.suptitle('Revenue and Quantity by Sale Hourwise',fontsize=15,y=0.92)
#add_value_labels(ax[0])
plt.show()
What time do people tend to buy our products? At 6 o'clock, people may want to return undesired stuff Starting from 7 am, people tend to make purchase on the online retail. As we can see the revenue hit the top at 12pm. Afterwards, sales gradually decrease till 18pm. After that only a few of customers left make purchases. Taking a look at the 2nd image, average revenue for an invoice at 7 am is substantially higher than the rest hours in a day. It suggests people make a huge quantity of items per transaction at the beginning of a day.
df.groupby('InvoiceNo')['weekday_nr'].unique().value_counts().sort_index()
ax = df.groupby('InvoiceNo')['weekday_nr'].unique().value_counts().sort_index().plot(kind='bar',color=color[1],figsize=(15,6))
ax.set_xlabel('Day',fontsize=12)
ax.set_ylabel('Number of Orders',fontsize=12)
ax.set_title('Number of orders for different Days',fontsize=15)
ax.set_xticklabels(('Mon','Tue','Wed','Thur','Fri','Sun'), rotation='horizontal', fontsize=13)
plt.show()
fig, ax = plt.subplots(nrows=1, ncols=1,figsize=(15,5))
sns.set_style("darkgrid")
week = df[df['TotalPrice']>=0][['InvoiceDate','TotalPrice','Quantity']]
week = week.groupby(week['InvoiceDate'].dt.weekday)[['TotalPrice','Quantity']].sum()
week = week.reset_index()
week['Week'] = week['InvoiceDate'].apply(lambda x: calendar.day_name[x])
sns.lineplot(data = week, x=week.Week, y='Quantity', marker='o', sort = False, ax=ax, color=color[0])
ax2 = ax.twinx()
sns.barplot(data = week, x=week.Week, y='TotalPrice', alpha=0.5, ax=ax2, palette='mako')
fig.suptitle('Revenue and Quantity by Sale Week Day Wise',fontsize=15)
add_value_labels(ax2)
plt.show()
fig, ax = plt.subplots(nrows=2, ncols=1,figsize=(15,10))
sns.set_style("darkgrid")
date = df[df['TotalPrice']>=0][['InvoiceDate','TotalPrice','Quantity']]
date = date.groupby(date['InvoiceDate'].dt.day)[['TotalPrice','Quantity']].sum()
sns.barplot(data = date, x=date.index, y='TotalPrice', alpha=1, ax=ax[0],
palette="mako")
sns.lineplot(data = date, x=date.index, y='Quantity', marker='o', sort = False, ax=ax[1], color=color[1])
fig.suptitle('Revenue and Quantity by Sale Daywise',fontsize=15,y=0.92)
plt.show()
Customers tend to buy more at the beginning of month
month_sales = df.groupby(['Month'])['TotalPrice'].agg(['sum','mean'])
sns.set_style("darkgrid")
fig, axes = plt.subplots(1, 2, figsize=(18, 5))
axes = axes.flatten()
sns.barplot(x=month_sales.index, y=month_sales['sum'], ax=axes[0],
palette="mako_r").set_title("Total Revenue over a year")
plt.ylabel('a')
sns.barplot(x=month_sales.index, y=month_sales['mean'], ax=axes[1],
palette="mako_r").set_title("Average Revenue over a year")
plt.show()
#month_sales
fig, ax = plt.subplots(nrows=2, ncols=1,figsize=(15,7))
sns.set_style("darkgrid")
q = df[(df['TotalPrice']>=0)&(df['InvoiceDate'].dt.year==2011)][['InvoiceDate','TotalPrice','Quantity']]
q = q.groupby(q['InvoiceDate'].dt.quarter)[['TotalPrice','Quantity']].sum()
sns.barplot(data = q, x=q.index, y='TotalPrice', alpha=0.7, ax=ax[0], palette = "ocean")
sns.lineplot(data = q, x=q.index, y='Quantity', marker='o', sort = False, ax=ax[1], color=color[1])
fig.suptitle('Revenue and Quantity by Sale Quarterly for 2011',fontsize=15)
add_value_labels(ax[0])
ax[1].set_xticklabels(['',1,'',2,'',3,'',4])
plt.show()
fig, ax = plt.subplots(nrows=4, ncols=1,figsize=(15,25))
rev = df[(df['TotalPrice']>0)&(df['InvoiceDate'].dt.year==2011)]
rev['Transactions Count'] = 1
rev = rev.groupby(rev['InvoiceDate'].dt.month).agg({'TotalPrice':'sum',
'Quantity': 'sum',
'CustomerID': 'count',
'Transactions Count':'sum'})
rev = rev.reset_index()
rev['Month'] = rev['InvoiceDate'].apply(lambda x: calendar.month_name[x])
rev = rev.rename({'TotalPrice':'Revenue'},axis=1)
cust = df.groupby('CustomerID').first().reset_index()[['CustomerID','InvoiceDate']]
cust = cust.groupby(cust.InvoiceDate.dt.month).agg({'CustomerID':'count'})
cust = cust.reset_index()
cust['Month'] = cust['InvoiceDate'].apply(lambda x: calendar.month_name[x])
sns.set_style("darkgrid")
sns.barplot(data=rev, x=rev.Month, y='Revenue', palette='summer_r', ax=ax[0])
ax[0].title.set_text('Revenue by Months')
add_value_labels(ax[0])
sns.barplot(data=rev, x=rev.Month, y='Quantity', palette='ocean', ax=ax[1])
ax[1].title.set_text('Quantity Sold by Months')
add_value_labels(ax[1])
sns.barplot(data=cust, x=cust.Month, y='CustomerID', palette='mako_r', ax=ax[2])
ax[2].title.set_text('New Customers by Months')
fig.suptitle('Growth Month wise',fontsize=16)
add_value_labels(ax[2])
sns.barplot(data=rev, x=rev.Month, y='Transactions Count', palette='gist_earth', ax=ax[3])
ax[3].title.set_text('Transactions Count by Months')
fig.suptitle('Growth Month wise',fontsize=16,y=0.90)
add_value_labels(ax[3])
plt.show()
Which month we sell out most and least? As we see, from January to August, the revenue makes a gradual increase from $560K to $700K. Towards the end of the year, sales make a huge jump to over a million and peak in November with $1461K However, looking at average revenue diagram indicates nothing change drastically.
sales_comp = df[(df['InvoiceDate'].dt.month==12)&(df['TotalPrice']>=0)][['InvoiceDate','TotalPrice','Quantity']]
sales_comp['Transactions Count'] = 1
sales_comp = sales_comp.groupby(sales_comp['InvoiceDate'].dt.year)[['TotalPrice','Quantity','Transactions Count']].sum()
fig, ax = plt.subplots(nrows=1, ncols=3,figsize=(20,5))
sns.set_style("whitegrid")
sns.barplot(data=sales_comp, x=sales_comp.index, y='TotalPrice', palette='summer', ax=ax[0])
ax[0].title.set_text('Revenue Comparision')
ax[0].set_ylabel('Revenue')
ax[0].set_xlabel('December of Year')
add_value_labels(ax[0])
sns.barplot(data=sales_comp, x=sales_comp.index, y='Quantity', palette='ocean', ax=ax[1])
ax[1].title.set_text('Quantity Sold Comparision')
add_value_labels(ax[1])
ax[1].set_xlabel('December of Year')
sns.barplot(data=sales_comp, x=sales_comp.index, y='Transactions Count', palette='mako_r', ax=ax[2])
ax[2].title.set_text('Transactions Count Comparision')
add_value_labels(ax[2])
ax[2].set_xlabel('December of Year')
fig.suptitle('Comparision for the month of December in 2010 and 2011',fontsize=16)
plt.show()
sns.set_style("darkgrid")
#exploratory statistics
plt.figure(figsize=(15,5))
temp = df.groupby(['Hour'])['TotalPrice'].apply(np.sum)
plt.plot(temp, color=color[3]);
plt.ylim(0,1.1*max(temp.values))
plt.ylabel('Total revenue over year',fontsize=12,fontweight='bold');
plt.xlabel('Hour',fontsize=12,fontweight='bold');
plt.title("Overview of the Total Revenue over the Years", fontsize=20, y=1.10)
plt.show()
#exploratory statistics
plt.figure(figsize=(15,5))
temp = df.groupby(['weekday'])['TotalPrice'].apply(np.sum)
plt.plot(temp,color=color[2]);
plt.ylim(0,1.1*max(temp.values))
plt.ylabel('Total revenue over year',fontsize=12,fontweight='bold');
plt.xlabel('Day of week',fontsize=12,fontweight='bold');
plt.show()
#exploratory statistics
plt.figure(figsize=(15,5))
temp = df.groupby(['Day'])['TotalPrice'].apply(np.sum)
plt.plot(temp,color=color[1]);
plt.ylim(0,1.1*max(temp.values))
plt.ylabel('Total revenue over year',fontsize=12,fontweight='bold');
plt.xlabel('Day of month',fontsize=12,fontweight='bold');
plt.show()
#exploratory statistics
plt.figure(figsize=(15,5))
temp = df.groupby(['Month'])['TotalPrice'].apply(np.sum)
plt.plot(temp,color=color[0])
plt.ylim(0,1.1*max(temp.values))
plt.ylabel('Total revenue over year',fontsize=12,fontweight='bold');
plt.xlabel('Month',fontsize=12,fontweight='bold');
plt.show()
Hourly sales are normally distributed and with mean 709k. There is no pattern in day wise sales in a month. There is no business on Saturday. Average sales are consistent from Monday to Friday Sales starts at 6 in the morning and ends at 8 PM. Peak hours of sales are between 10 AM to 3 PM Monday and Tuesday Sales starts at 7 AM and close at 6 PM Wednesday and Friday sales start at 7 AM and close at 8 PM Business hours for Sunday is between 9 to 4 PM
# Same as after ?
sales_by_date = df.groupby(by='Date')['Sales'].sum().reset_index()
fig = go.Figure(data=go.Scatter(x=sales_by_date.Date,y=sales_by_date.Sales
,line = dict(color="#4b778d", width=1.5)))
fig.update_layout(xaxis_title="Date",yaxis_title="Sales",title='Daily Sales',template='ggplot2')
fig.show()
fig, ax = plt.subplots(nrows=4, ncols=1,figsize=(15,20))
rev = df[df['TotalPrice']>=0]
rev['TransactionsCount'] = 1
rev = rev.groupby(rev['InvoiceDate'].dt.date).agg({'TotalPrice':'sum',
'Quantity': 'sum',
'CustomerID': 'count',
'TransactionsCount':'sum'})
rev['10 Days Moving Average Revenue'] = rev['TotalPrice'].rolling(10).mean()
rev['10 Days Moving Average Quantity'] = rev['Quantity'].rolling(10).mean()
rev['10 Days Moving Transactions Count'] = rev['TransactionsCount'].rolling(10).mean()
cust = df.groupby('CustomerID').first().reset_index()[['CustomerID','InvoiceDate']]
cust = cust.groupby(cust.InvoiceDate.dt.date).agg({'CustomerID':'count'})
cust['10 Days Moving Average Quantity'] = cust['CustomerID'].rolling(10).mean()
sns.set_style("darkgrid")
sns.lineplot(data=rev[['TotalPrice','10 Days Moving Average Revenue']], palette='summer_r', linewidth=1.5, ax=ax[0],legend=False)
ax[0].legend(title='Revenue Trends', loc='upper left', labels=['Revenue', '10 Days Moving Average Revenue'])
ax[0].title.set_text('Revenue Trends')
ax[0].set_xlabel('')
sns.lineplot(data=rev[['TotalPrice','10 Days Moving Average Quantity']], palette='ocean', linewidth=1.5, ax=ax[1])
ax[1].legend(title='Quantity Trends', loc='upper left', labels=['Quantity Sold', '10 Days Moving Average Quantity'])
ax[1].title.set_text('Quantity Sold Trends')
ax[1].set_xlabel('')
sns.lineplot(data=cust, palette='mako', linewidth=1.5, ax=ax[2])
ax[2].legend(title='New Customers Trends', loc='upper right', labels=['New Customers', '10 Days Moving Average New Customers'])
ax[2].title.set_text('New Customers Trends')
ax[2].set_xlabel('')
sns.lineplot(data=rev[['TransactionsCount','10 Days Moving Transactions Count']], palette='gist_earth', linewidth=1.5, ax=ax[3])
ax[3].legend(title='Transactions Count Trend', loc='upper left', labels=['Transactions Count', '10 Days Moving Average Transactions Count'])
ax[3].title.set_text('Transactions Count Trends')
ax[3].set_xlabel('')
plt.show()
'''Convert InvoicdeDate to datetime'''
df['InvoiceDate'] = pd.to_datetime(df['InvoiceDate'])
'''Grouping the data based on months to get a feel of the monthly sales data'''
data_new = df.groupby(pd.Grouper(key='InvoiceDate',freq='M')).sum()
data_new.reset_index(level=0, inplace=True)
fig, axs = plt.subplots(2, 1, figsize=(20,12))
sns.barplot(data=data_new, x='InvoiceDate', y='TotalPrice', palette="mako", ax=axs[0])
axs[0].title.set_text("Total Amount of Sales per Months")
axs[0].set_xticklabels(data_new.InvoiceDate.astype(str))
sns.barplot(data=data_new, x='InvoiceDate', y='Quantity', palette="mako", ax=axs[1])
axs[1].title.set_text("Total Quantity of Orders per Months")
axs[1].set_xticklabels(data_new.InvoiceDate.astype(str))
plt.show()
Thus, we conclude that November 2011 resulted in the highest sales both by TotalAmount and Quantity, which might be caused by the early Christmas shopping. Comparing to February 2011 which has the worst in terms of sales. The last quarter of the year was best among the 2011 quarters.
#Average per Order in EUR (AOV)
df_wt_cust = df[(df['CustomerID'].notnull()) & (df['Quantity']>0)]
order_net = df_wt_cust.groupby(['InvoiceNo']).TotalPrice.sum()
aov = order_net.mean()
plt.figure(figsize=(15,6))
plt.hist(order_net,bins=1000, color=color[2])
plt.xlim(0,2000)
plt.xlabel("Order value")
plt.ylabel("Number of orders")
plt.title("Value of Orders", fontsize=15, y=1.05)
print("AOV : ",aov)
plt.show()
The aov is 480. Next I'm going to find the %repeat customers.
From the last part, we learned that there're refunded orders, so I'll calculated number of orders per customer by count total orders and subtract by total refund order by each customer
#Repeat Customers
#Exlude non-product rows
df_for_rpt_cust = df_wt_cust.copy()
cust_wt_total_order = df_for_rpt_cust[df_for_rpt_cust['TotalPrice']>0].groupby(['CustomerID']).InvoiceNo.nunique().reset_index()
cust_wt_total_order = cust_wt_total_order.rename(columns = {'InvoiceNo' : 'TotalOrder'})
cust_wt_total_refund_order = df_for_rpt_cust[df_for_rpt_cust['TotalPrice']<0].groupby(['CustomerID']).InvoiceNo.nunique().reset_index()
cust_wt_total_refund_order = cust_wt_total_refund_order.rename(columns = {'InvoiceNo' : 'TotalRefundOrder'})
join_cust_wt_total_order = cust_wt_total_order.merge(cust_wt_total_refund_order,left_on = 'CustomerID',right_on='CustomerID',how = 'left')
# convert null to 0
join_cust_wt_total_order['TotalRefundOrder'] = np.where(join_cust_wt_total_order['TotalRefundOrder'].isnull(),0,join_cust_wt_total_order['TotalRefundOrder'])
join_cust_wt_total_order['TotalSuccessOrder'] = join_cust_wt_total_order['TotalOrder']-join_cust_wt_total_order['TotalRefundOrder']
join_cust_wt_total_order['RepeatFlg'] = join_cust_wt_total_order['TotalSuccessOrder']>=2
CntCustomer = join_cust_wt_total_order.CustomerID.nunique()
CntRepeatCustomer = join_cust_wt_total_order[join_cust_wt_total_order['RepeatFlg']==True].CustomerID.nunique()
PctRepeatCustomer = CntRepeatCustomer/CntCustomer
print("%Repeat Customer : ",PctRepeatCustomer*100,"%")
plt.hist(join_cust_wt_total_order['TotalSuccessOrder'],bins = 100, color=color[3])
plt.xlim(0,30)
plt.xlabel("Number of Orders")
plt.ylabel("Number of Customers")
plt.show()
The percentage of repeated Customer is 65.5% which is pretty high and many customers keep coming back multiple times.
# remove outliers for Quantity
df_negPQ = df_negPQ[(df_negPQ['Quantity'] < 20000) & (df_negPQ['Quantity'] > -20000)]
Item_returned = df_negPQ[df_negPQ['Quantity'] < 0].groupby('CustomerID')[['TotalPrice',
'Quantity']].agg(['sum']).sort_values(by=[('Quantity', 'sum')], ascending=True).head(10)
sns.set_style("darkgrid")
plt.figure(figsize=(15,6))
sns.barplot(x=Item_returned.index, y=abs(Item_returned[('Quantity','sum')]), palette="mako")
plt.ylabel('A number of Quantity returned', fontsize = 12)
plt.xticks(rotation=65)
plt.title("Customers That Return The Most Items", fontsize=15, y=1.05)
plt.show()
Item_returned
plt.figure(figsize=(12,4))
sns.distplot(df[df['UnitPrice'] < 10]['UnitPrice'].values, kde=True, bins=10, color=color[1])
plt.title("Density of Products below 10£ Sold")
plt.show()
From these histograms we can see that vast majority of items sold in this store has low price range - 0 to 3 pounds.
plt.figure(figsize=(12,4))
sns.distplot(df[df['TotalPrice'] < 60]['TotalPrice'], kde=True, bins=10, color=color[2])
plt.title("Density of Total Price of Order")
plt.show()
From these histograms we can understand that majority of sales per order were in range 1-15 pounds each.
df_quantile_test = df.loc[((df['Quantity'] <= 15) & (df['Quantity'] > 0))]
plt.figure(figsize=(12,4))
sns.distplot(df_quantile_test['Quantity'], kde=True, bins=10, color=color[3])
plt.title("Density of Quantity Orders")
plt.show()
#& df_quantile['Quantity'] > 0
From these histograms we that people bought normally 1-5 items or 10-12
quant = df[["Quantity", "CustomerID", "InvoiceNo", "StockCode"]]
quant = quant.groupby('CustomerID')['Quantity'].agg(["sum"]).sort_values(by=[('sum')], ascending=False)
quant = quant.iloc[1:]
quant.reset_index(inplace=True)
quant.head()
quant['sum'].quantile([0.25,0.5,0.75])
customers = 0
wholesalers = 0
for i in range(len(quant)):
if quant['sum'].loc[i] > 989:
wholesalers = wholesalers + 1
else:
customers = customers +1
print(customers, wholesalers)
plt.figure(figsize=(25,7))
sns.boxplot(quant['sum'], color=color[1])
plt.title("Boxplot of Total Quantity per Customer", fontsize=15,y=1.03)
plt.show()
client = {'customers': [customers], 'wholesalers': [wholesalers]}
quant_ = pd.DataFrame(client)
sns.set_style("darkgrid")
plt.figure(figsize=(8,4))
sns.barplot(data=quant_, palette='ocean')
plt.title('Types of Customers', fontsize=15, y=1.05)
plt.show()
#returns cost 0.1%, so ignore first
data_TP_above0 = df[df['TotalPrice']>=0].reset_index(drop=True)
# are there any subgroups of important customer
temp = data_TP_above0.groupby(['CustomerID',])['TotalPrice'].apply(sum).reset_index(drop=False)
temp2 = temp.sort_values('TotalPrice').iloc[::-1].reset_index(drop=True)
result,val = [] ,0
for i in range(len(temp2)):
val += temp2.iloc[i]['TotalPrice']
result += [val,]
temp2['cummulative_profit'] = np.array(result)/val
for i in [0.01,0.1,1,2,5,10,20,50]:
num = int(len(temp2)*1.*i/100)
print (num,'Top %s percent of customers,%s percent of profit'%(i,100*np.round(temp2.loc[num]['cummulative_profit'],2)))
plt.figure(figsize=(15,6))
plt.plot(temp2['cummulative_profit'], color=color[2])
plt.xlabel('Customers ranked by revenue',fontsize=12,fontweight='bold')
plt.ylabel('Cummulative revenue',fontsize=12,fontweight='bold')
plt.show()
# are there any subgroups of important customer
temp = df.groupby(['StockCode_NR'])['TotalPrice'].apply(sum).reset_index(drop=False)
temp['totalRev_over_time'] = df.groupby(['StockCode_NR'])['TotalPrice'].apply(np.sum).reset_index(drop=True)
temp2 = temp.sort_values('totalRev_over_time').iloc[::-1].reset_index(drop=True)
result,val = [0,] ,0
for i in range(len(temp2)):
val += temp2.iloc[i]['TotalPrice']
result += [val,]
temp2['cummulative_profit'] = np.array(result[1:])/val
temp2['Total_profit'] = (np.array(result[1:])-np.array(result[:-1]))/val
for i in [0.01,0.1,1,2,5,10,20,50,60]:
num = int(len(temp2)*1.*i/100)
print (num,'Top %s percent of products,%s percent of profit'%(i,100*np.round(temp2.loc[num]['cummulative_profit'],2)))
plt.figure(figsize=(15,6))
plt.plot(temp2['cummulative_profit'], color=color[2])
x = 600 #coresponds to 97% products
#print (np.sum(temp2['totalRev_over_time']>x),'threshold=',x,',',np.round(np.mean(temp2['totalRev_over_time']>x),2),'of products account for',np.round(np.sum(temp2[temp2['totalRev_over_time']>x]['TotalRevenue'])/np.sum(temp2['TotalRevenue']),3),'of revenue\n\n')
temp3 = temp2[temp2['totalRev_over_time']>x]
data2 = pd.merge(df,temp3[['StockCode_NR','totalRev_over_time','Total_profit']]
,on='StockCode_NR',how='inner')
plt.xlabel('Products ranked by revenue',fontsize=12,fontweight='bold')
plt.ylabel('Cummulative revenue',fontsize=12,fontweight='bold')
plt.show()
# Calculate the number of new users, inactive users, return users and active users in each month
# Definition:
# New users: those who made their first purchase in the current month
# Active users: those who made purchases in the previous month and in the current month
# Inactive users: those who made purchases in previous months, but not in the current month
# Return users: those who made purchases before the previous month, not in the previous month and made purchases agian in the current month
df["InvoiceMonth"] = df["InvoiceDate"].apply(lambda x: x.replace(day=1))
df["InvoiceMonth"] = df["InvoiceMonth"].apply(lambda x: x.strftime("%Y-%m-%d"))
user_month_pivot = df.pivot_table(index=["CustomerID"],
columns=["InvoiceMonth"],
values=["InvoiceNo"],
aggfunc="count",
fill_value=0)
user_month_pivot.head()
# Replace count of invoices with 1
user_month_pivot = user_month_pivot.applymap(lambda x: 1 if x>0 else 0)
user_month_pivot.head()
# Define functions to get user status
def user_status(data):
status = []
for i in range(13):
# If the user has no purchase in the current month
if data[i] == 0:
# If the user has made purchases before
if len(status) > 0:
# If the user is unregistered in the previous month
if status[i-1] == "unreg":
# The the user is also unregistered this month
status.append("unreg")
# Otherwise the user is an active user, i.e., he/she already registered
else:
status.append("inactive")
# Otherwise the user is not registered in the current month, i.e., he/she has never made any purchases
else:
status.append("unreg")
else:
# This is the first purchase of the user
if len(status) == 0:
status.append("new")
else:
if status[i-1] == "inactive":
status.append("return")
elif status[i-1] == "unreg":
status.append("new")
else:
status.append("active")
return status
user_month_status = pd.DataFrame(user_month_pivot.apply(lambda x: pd.Series(user_status(x)), axis=1))
user_month_status.columns = user_month_pivot.columns
user_month_status.head()
month_status_pivot = pd.DataFrame(user_month_status.replace("unreg", np.NaN).apply(lambda x: pd.value_counts(x)))
month_status_pivot.head()
month_status_pivot = month_status_pivot.fillna(0).T
month_status_pivot.reset_index(inplace=True)
month_status_pivot.set_index("InvoiceMonth", inplace=True)
ax = month_status_pivot.plot.area(figsize = (15,7), color=color)
plt.title("Number of Users by Status in each month", fontsize=15, y=1.03)
plt.show()